import matplotlib.pyplot as plt
import plotly as ply
import plotly.graph_objects as go
import pandas as pd
import numpy as np
import seaborn as sns
import datetime
import missingno
df = pd.read_csv("../data/application_data.csv")
df
| SK_ID_CURR | TARGET | NAME_CONTRACT_TYPE | CODE_GENDER | FLAG_OWN_CAR | FLAG_OWN_REALTY | CNT_CHILDREN | AMT_INCOME_TOTAL | AMT_CREDIT | AMT_ANNUITY | ... | FLAG_DOCUMENT_18 | FLAG_DOCUMENT_19 | FLAG_DOCUMENT_20 | FLAG_DOCUMENT_21 | AMT_REQ_CREDIT_BUREAU_HOUR | AMT_REQ_CREDIT_BUREAU_DAY | AMT_REQ_CREDIT_BUREAU_WEEK | AMT_REQ_CREDIT_BUREAU_MON | AMT_REQ_CREDIT_BUREAU_QRT | AMT_REQ_CREDIT_BUREAU_YEAR | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 100002 | 1 | Cash loans | M | N | Y | 0 | 202500.0 | 406597.5 | 24700.5 | ... | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
| 1 | 100003 | 0 | Cash loans | F | N | N | 0 | 270000.0 | 1293502.5 | 35698.5 | ... | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 2 | 100004 | 0 | Revolving loans | M | Y | Y | 0 | 67500.0 | 135000.0 | 6750.0 | ... | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 3 | 100006 | 0 | Cash loans | F | N | Y | 0 | 135000.0 | 312682.5 | 29686.5 | ... | 0 | 0 | 0 | 0 | NaN | NaN | NaN | NaN | NaN | NaN |
| 4 | 100007 | 0 | Cash loans | M | N | Y | 0 | 121500.0 | 513000.0 | 21865.5 | ... | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 307506 | 456251 | 0 | Cash loans | M | N | N | 0 | 157500.0 | 254700.0 | 27558.0 | ... | 0 | 0 | 0 | 0 | NaN | NaN | NaN | NaN | NaN | NaN |
| 307507 | 456252 | 0 | Cash loans | F | N | Y | 0 | 72000.0 | 269550.0 | 12001.5 | ... | 0 | 0 | 0 | 0 | NaN | NaN | NaN | NaN | NaN | NaN |
| 307508 | 456253 | 0 | Cash loans | F | N | Y | 0 | 153000.0 | 677664.0 | 29979.0 | ... | 0 | 0 | 0 | 0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 |
| 307509 | 456254 | 1 | Cash loans | F | N | Y | 0 | 171000.0 | 370107.0 | 20205.0 | ... | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 307510 | 456255 | 0 | Cash loans | F | N | N | 0 | 157500.0 | 675000.0 | 49117.5 | ... | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 2.0 | 0.0 | 1.0 |
307511 rows × 122 columns
data_descriptions = pd.read_csv("../data/columns_description.csv", encoding='latin1')
data_descriptions.head()
| Unnamed: 0 | Table | Row | Description | Special | |
|---|---|---|---|---|---|
| 0 | 1 | application_data | SK_ID_CURR | ID of loan in our sample | NaN |
| 1 | 2 | application_data | TARGET | Target variable (1 - client with payment diffi... | NaN |
| 2 | 5 | application_data | NAME_CONTRACT_TYPE | Identification if loan is cash or revolving | NaN |
| 3 | 6 | application_data | CODE_GENDER | Gender of the client | NaN |
| 4 | 7 | application_data | FLAG_OWN_CAR | Flag if the client owns a car | NaN |
my_vars = pd.read_csv("../data/my_vars.csv")
my_vars_vec = my_vars['Row'].to_list()
my_df = df[my_vars_vec].copy()
my_df.head()
| AMT_ANNUITY | AMT_GOODS_PRICE | OWN_CAR_AGE | CNT_FAM_MEMBERS | EXT_SOURCE_1 | EXT_SOURCE_2 | EXT_SOURCE_3 | APARTMENTS_AVG | BASEMENTAREA_AVG | YEARS_BEGINEXPLUATATION_AVG | YEARS_BUILD_AVG | COMMONAREA_AVG | ELEVATORS_AVG | ENTRANCES_AVG | FLOORSMAX_AVG | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 24700.5 | 351000.0 | NaN | 1.0 | 0.083037 | 0.262949 | 0.139376 | 0.0247 | 0.0369 | 0.9722 | 0.6192 | 0.0143 | 0.00 | 0.0690 | 0.0833 |
| 1 | 35698.5 | 1129500.0 | NaN | 2.0 | 0.311267 | 0.622246 | NaN | 0.0959 | 0.0529 | 0.9851 | 0.7960 | 0.0605 | 0.08 | 0.0345 | 0.2917 |
| 2 | 6750.0 | 135000.0 | 26.0 | 1.0 | NaN | 0.555912 | 0.729567 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 3 | 29686.5 | 297000.0 | NaN | 2.0 | NaN | 0.650442 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 4 | 21865.5 | 513000.0 | NaN | 1.0 | NaN | 0.322738 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
aux = my_vars[my_vars['Category'] == 1]
my_vars_vec = aux['Row'].to_list()
first_df = df[my_vars_vec].copy()
aux = my_vars[my_vars['Category'] == 2]
my_vars_vec = aux['Row'].to_list()
second_df = df[my_vars_vec].copy()
aux = my_vars[my_vars['Category'] == 3]
my_vars_vec = aux['Row'].to_list()
third_df = df[my_vars_vec].copy()
aux = my_vars[my_vars['Category'] == 4]
my_vars_vec = aux['Row'].to_list()
fourth_df = df[my_vars_vec].copy()
aux = my_vars[my_vars['Category'] == 5]
my_vars_vec = aux['Row'].to_list()
fifth_df = df[my_vars_vec].copy()
%matplotlib inline
missingno.bar(df, color=(63/255,132/255,172/255))
<AxesSubplot: >
%matplotlib inline
missingno.matrix(df, color=(63/255,132/255,172/255))
<AxesSubplot: >
%matplotlib inline
missingno.heatmap(df)
<AxesSubplot: >
%matplotlib inline
missingno.heatmap(df[['AMT_REQ_CREDIT_BUREAU_DAY','AMT_REQ_CREDIT_BUREAU_MON','AMT_REQ_CREDIT_BUREAU_YEAR','EXT_SOURCE_3']])
<AxesSubplot: >
%matplotlib inline
missingno.heatmap(df[['AMT_ANNUITY','AMT_GOODS_PRICE','NAME_TYPE_SUITE','OWN_CAR_AGE','OCCUPATION_TYPE','CNT_FAM_MEMBERS','EXT_SOURCE_1','EXT_SOURCE_2','EXT_SOURCE_3']])
<AxesSubplot: >
%matplotlib inline
missingno.heatmap(df[['OBS_30_CNT_SOCIAL_CIRCLE','DEF_30_CNT_SOCIAL_CIRCLE','OBS_60_CNT_SOCIAL_CIRCLE','DEF_60_CNT_SOCIAL_CIRCLE']])
<AxesSubplot: >
%matplotlib inline
missingno.bar(my_df, color=(63/255,132/255,172/255))
<AxesSubplot: >
%matplotlib inline
missingno.matrix(my_df, color=(63/255,132/255,172/255))
<AxesSubplot: >
%matplotlib inline
missingno.heatmap(my_df)
<AxesSubplot: >
%matplotlib inline
missingno.bar(first_df, color=(63/255,132/255,172/255))
<AxesSubplot: >
%matplotlib inline
missingno.matrix(first_df, color=(63/255,132/255,172/255))
<AxesSubplot: >
print(data_descriptions[data_descriptions['Row'] == 'AMT_ANNUITY']['Description'].to_list())
print(data_descriptions[data_descriptions['Row'] == 'AMT_ANNUITY']['Special'].to_list())
print(data_descriptions[data_descriptions['Row'] == 'AMT_GOODS_PRICE']['Description'].to_list())
print(data_descriptions[data_descriptions['Row'] == 'AMT_GOODS_PRICE']['Special'].to_list())
print(data_descriptions[data_descriptions['Row'] == 'NAME_TYPE_SUITE']['Description'].to_list())
print(data_descriptions[data_descriptions['Row'] == 'NAME_TYPE_SUITE']['Special'].to_list())
['Loan annuity'] [nan] ['For consumer loans it is the price of the goods for which the loan is given'] [nan] ['Who was accompanying client when he was applying for the loan'] [nan]
df_helper = df[['AMT_ANNUITY']].copy()
sns.displot(data=df_helper, x="AMT_ANNUITY", kde=True)
<seaborn.axisgrid.FacetGrid at 0x7efcb0921db0>
df_helper = df[['AMT_GOODS_PRICE']].copy()
sns.displot(data=df_helper, x="AMT_GOODS_PRICE", kde=False)
<seaborn.axisgrid.FacetGrid at 0x7efcb016b970>
df_helper = df[['NAME_TYPE_SUITE']].copy()
df_helper.drop_duplicates(inplace=True)
df_helper
| NAME_TYPE_SUITE | |
|---|---|
| 0 | Unaccompanied |
| 1 | Family |
| 5 | Spouse, partner |
| 8 | Children |
| 18 | Other_A |
| 64 | NaN |
| 174 | Other_B |
| 416 | Group of people |
pd.set_option('display.max_columns', None)
df[df['AMT_ANNUITY'].isna()]
| SK_ID_CURR | TARGET | NAME_CONTRACT_TYPE | CODE_GENDER | FLAG_OWN_CAR | FLAG_OWN_REALTY | CNT_CHILDREN | AMT_INCOME_TOTAL | AMT_CREDIT | AMT_ANNUITY | AMT_GOODS_PRICE | NAME_TYPE_SUITE | NAME_INCOME_TYPE | NAME_EDUCATION_TYPE | NAME_FAMILY_STATUS | NAME_HOUSING_TYPE | REGION_POPULATION_RELATIVE | DAYS_BIRTH | DAYS_EMPLOYED | DAYS_REGISTRATION | DAYS_ID_PUBLISH | OWN_CAR_AGE | FLAG_MOBIL | FLAG_EMP_PHONE | FLAG_WORK_PHONE | FLAG_CONT_MOBILE | FLAG_PHONE | FLAG_EMAIL | OCCUPATION_TYPE | CNT_FAM_MEMBERS | REGION_RATING_CLIENT | REGION_RATING_CLIENT_W_CITY | WEEKDAY_APPR_PROCESS_START | HOUR_APPR_PROCESS_START | REG_REGION_NOT_LIVE_REGION | REG_REGION_NOT_WORK_REGION | LIVE_REGION_NOT_WORK_REGION | REG_CITY_NOT_LIVE_CITY | REG_CITY_NOT_WORK_CITY | LIVE_CITY_NOT_WORK_CITY | ORGANIZATION_TYPE | EXT_SOURCE_1 | EXT_SOURCE_2 | EXT_SOURCE_3 | APARTMENTS_AVG | BASEMENTAREA_AVG | YEARS_BEGINEXPLUATATION_AVG | YEARS_BUILD_AVG | COMMONAREA_AVG | ELEVATORS_AVG | ENTRANCES_AVG | FLOORSMAX_AVG | FLOORSMIN_AVG | LANDAREA_AVG | LIVINGAPARTMENTS_AVG | LIVINGAREA_AVG | NONLIVINGAPARTMENTS_AVG | NONLIVINGAREA_AVG | APARTMENTS_MODE | BASEMENTAREA_MODE | YEARS_BEGINEXPLUATATION_MODE | YEARS_BUILD_MODE | COMMONAREA_MODE | ELEVATORS_MODE | ENTRANCES_MODE | FLOORSMAX_MODE | FLOORSMIN_MODE | LANDAREA_MODE | LIVINGAPARTMENTS_MODE | LIVINGAREA_MODE | NONLIVINGAPARTMENTS_MODE | NONLIVINGAREA_MODE | APARTMENTS_MEDI | BASEMENTAREA_MEDI | YEARS_BEGINEXPLUATATION_MEDI | YEARS_BUILD_MEDI | COMMONAREA_MEDI | ELEVATORS_MEDI | ENTRANCES_MEDI | FLOORSMAX_MEDI | FLOORSMIN_MEDI | LANDAREA_MEDI | LIVINGAPARTMENTS_MEDI | LIVINGAREA_MEDI | NONLIVINGAPARTMENTS_MEDI | NONLIVINGAREA_MEDI | FONDKAPREMONT_MODE | HOUSETYPE_MODE | TOTALAREA_MODE | WALLSMATERIAL_MODE | EMERGENCYSTATE_MODE | OBS_30_CNT_SOCIAL_CIRCLE | DEF_30_CNT_SOCIAL_CIRCLE | OBS_60_CNT_SOCIAL_CIRCLE | DEF_60_CNT_SOCIAL_CIRCLE | DAYS_LAST_PHONE_CHANGE | FLAG_DOCUMENT_2 | FLAG_DOCUMENT_3 | FLAG_DOCUMENT_4 | FLAG_DOCUMENT_5 | FLAG_DOCUMENT_6 | FLAG_DOCUMENT_7 | FLAG_DOCUMENT_8 | FLAG_DOCUMENT_9 | FLAG_DOCUMENT_10 | FLAG_DOCUMENT_11 | FLAG_DOCUMENT_12 | FLAG_DOCUMENT_13 | FLAG_DOCUMENT_14 | FLAG_DOCUMENT_15 | FLAG_DOCUMENT_16 | FLAG_DOCUMENT_17 | FLAG_DOCUMENT_18 | FLAG_DOCUMENT_19 | FLAG_DOCUMENT_20 | FLAG_DOCUMENT_21 | AMT_REQ_CREDIT_BUREAU_HOUR | AMT_REQ_CREDIT_BUREAU_DAY | AMT_REQ_CREDIT_BUREAU_WEEK | AMT_REQ_CREDIT_BUREAU_MON | AMT_REQ_CREDIT_BUREAU_QRT | AMT_REQ_CREDIT_BUREAU_YEAR | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 47531 | 155054 | 0 | Cash loans | M | N | N | 0 | 180000.0 | 450000.0 | NaN | 450000.0 | Unaccompanied | Commercial associate | Incomplete higher | Single / not married | House / apartment | 0.026392 | -10668 | -2523 | -4946.0 | -3238 | NaN | 1 | 1 | 1 | 1 | 1 | 0 | High skill tech staff | 1.0 | 2 | 2 | WEDNESDAY | 13 | 0 | 0 | 0 | 0 | 0 | 0 | Business Entity Type 3 | NaN | 0.626896 | 0.372334 | 0.0124 | NaN | 0.9662 | NaN | NaN | 0.00 | 0.0345 | 0.0417 | NaN | NaN | NaN | 0.0115 | NaN | 0.0000 | 0.0126 | NaN | 0.9662 | NaN | NaN | 0.0000 | 0.0345 | 0.0417 | NaN | NaN | NaN | 0.0119 | NaN | 0.0000 | 0.0125 | NaN | 0.9662 | NaN | NaN | 0.00 | 0.0345 | 0.0417 | NaN | NaN | NaN | 0.0117 | NaN | 0.0000 | NaN | block of flats | 0.0090 | Stone, brick | No | 1.0 | 0.0 | 1.0 | 0.0 | -2.0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 1.0 |
| 50035 | 157917 | 0 | Cash loans | F | N | N | 0 | 94500.0 | 450000.0 | NaN | 450000.0 | Unaccompanied | Working | Lower secondary | Civil marriage | House / apartment | 0.035792 | -9027 | -1270 | -3640.0 | -741 | NaN | 1 | 1 | 1 | 1 | 0 | 0 | Laborers | 2.0 | 2 | 2 | MONDAY | 20 | 0 | 0 | 0 | 0 | 0 | 0 | Business Entity Type 1 | NaN | 0.727274 | 0.468660 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0.0 | 0.0 | 0.0 | 0.0 | -706.0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 |
| 51594 | 159744 | 0 | Cash loans | F | N | N | 0 | 202500.0 | 539100.0 | NaN | 450000.0 | Unaccompanied | Working | Secondary / secondary special | Married | House / apartment | 0.046220 | -17376 | -3950 | -11524.0 | -831 | NaN | 1 | 1 | 0 | 1 | 0 | 0 | Cooking staff | 2.0 | 1 | 1 | WEDNESDAY | 15 | 0 | 0 | 0 | 1 | 1 | 1 | Self-employed | 0.885980 | 0.738370 | 0.452534 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 1.0 | 0.0 | 1.0 | 0.0 | -199.0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 |
| 55025 | 163757 | 0 | Cash loans | F | N | N | 0 | 162000.0 | 296280.0 | NaN | 225000.0 | Unaccompanied | State servant | Higher education | Married | House / apartment | 0.035792 | -11329 | -2040 | -3195.0 | -3069 | NaN | 1 | 1 | 0 | 1 | 0 | 0 | Core staff | 2.0 | 2 | 2 | FRIDAY | 13 | 0 | 0 | 0 | 1 | 1 | 1 | Government | 0.336803 | 0.566316 | 0.220095 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0.0 | 0.0 | 0.0 | 0.0 | -2841.0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 1.0 | 0.0 | 4.0 |
| 59934 | 169487 | 0 | Cash loans | M | Y | N | 0 | 202500.0 | 360000.0 | NaN | 360000.0 | Unaccompanied | Working | Secondary / secondary special | Single / not married | House / apartment | 0.046220 | -19762 | -2498 | -11285.0 | -3305 | 4.0 | 1 | 1 | 1 | 1 | 1 | 0 | Laborers | 1.0 | 1 | 1 | FRIDAY | 15 | 0 | 1 | 1 | 0 | 1 | 1 | Other | 0.647350 | 0.583947 | 0.177704 | 0.0856 | NaN | 0.9841 | NaN | NaN | 0.08 | 0.0345 | 0.4583 | NaN | NaN | NaN | 0.0548 | NaN | 0.1139 | 0.0872 | NaN | 0.9841 | NaN | NaN | 0.0806 | 0.0345 | 0.4583 | NaN | NaN | NaN | 0.0571 | NaN | 0.1206 | 0.0864 | NaN | 0.9841 | NaN | NaN | 0.08 | 0.0345 | 0.4583 | NaN | NaN | NaN | 0.0558 | NaN | 0.1163 | NaN | block of flats | 0.0679 | Stone, brick | No | 0.0 | 0.0 | 0.0 | 0.0 | -743.0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 6.0 |
| 75873 | 187985 | 0 | Cash loans | M | Y | N | 0 | 144000.0 | 219249.0 | NaN | 166500.0 | Unaccompanied | Working | Higher education | Single / not married | Rented apartment | 0.022800 | -20831 | -2450 | -771.0 | -4203 | 8.0 | 1 | 1 | 0 | 1 | 0 | 0 | Drivers | 1.0 | 2 | 2 | FRIDAY | 15 | 0 | 0 | 0 | 0 | 0 | 0 | Business Entity Type 3 | 0.339389 | 0.667564 | 0.425893 | 0.0928 | 0.0719 | 0.9831 | 0.7688 | 0.0113 | 0.00 | 0.2069 | 0.1667 | 0.0417 | 0.6932 | 0.0748 | 0.0865 | 0.0039 | 0.0031 | 0.0945 | 0.0746 | 0.9831 | 0.7779 | 0.0114 | 0.0000 | 0.2069 | 0.1667 | 0.0417 | 0.7091 | 0.0817 | 0.0901 | 0.0039 | 0.0033 | 0.0937 | 0.0719 | 0.9831 | 0.7719 | 0.0113 | 0.00 | 0.2069 | 0.1667 | 0.0417 | 0.7053 | 0.0761 | 0.0880 | 0.0039 | 0.0032 | not specified | block of flats | 0.0687 | Panel | No | 0.0 | 0.0 | 0.0 | 0.0 | -1986.0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 11.0 |
| 89343 | 203726 | 0 | Cash loans | F | Y | N | 0 | 90000.0 | 157500.0 | NaN | 157500.0 | Unaccompanied | State servant | Secondary / secondary special | Married | House / apartment | 0.015221 | -12134 | -3721 | -858.0 | -591 | 2.0 | 1 | 1 | 0 | 1 | 1 | 0 | Medicine staff | 2.0 | 2 | 2 | WEDNESDAY | 11 | 0 | 0 | 0 | 0 | 0 | 0 | Medicine | 0.682126 | 0.154811 | 0.297087 | 0.1227 | NaN | 0.9781 | NaN | NaN | 0.00 | 0.2759 | 0.1667 | NaN | NaN | NaN | 0.1131 | NaN | 0.0044 | 0.1250 | NaN | 0.9782 | NaN | NaN | 0.0000 | 0.2759 | 0.1667 | NaN | NaN | NaN | 0.1178 | NaN | 0.0046 | 0.1239 | NaN | 0.9781 | NaN | NaN | 0.00 | 0.2759 | 0.1667 | NaN | NaN | NaN | 0.1151 | NaN | 0.0045 | NaN | block of flats | 0.1218 | Panel | No | 0.0 | 0.0 | 0.0 | 0.0 | -348.0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 2.0 | 0.0 | 0.0 |
| 123872 | 243648 | 0 | Cash loans | F | N | Y | 0 | 202500.0 | 929088.0 | NaN | 720000.0 | Unaccompanied | Working | Secondary / secondary special | Civil marriage | House / apartment | 0.019689 | -13902 | -3540 | -168.0 | -4250 | NaN | 1 | 1 | 0 | 1 | 0 | 1 | Secretaries | 2.0 | 2 | 2 | SATURDAY | 12 | 0 | 0 | 0 | 0 | 0 | 0 | Self-employed | 0.400723 | 0.510934 | 0.581484 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0.0 | 0.0 | 0.0 | 0.0 | -1331.0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 5.0 |
| 207186 | 340147 | 0 | Cash loans | M | N | N | 0 | 171000.0 | 486000.0 | NaN | 486000.0 | Unaccompanied | Commercial associate | Higher education | Married | House / apartment | 0.018634 | -10151 | -472 | -10127.0 | -2787 | NaN | 1 | 1 | 0 | 1 | 1 | 0 | Security staff | 2.0 | 2 | 2 | WEDNESDAY | 13 | 1 | 1 | 1 | 1 | 1 | 1 | Security | NaN | 0.706306 | 0.391055 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0.0 | 0.0 | 0.0 | 0.0 | -295.0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 1.0 | 1.0 | 0.0 | 2.0 |
| 227939 | 364022 | 0 | Cash loans | F | N | Y | 0 | 315000.0 | 628069.5 | NaN | 499500.0 | Unaccompanied | Commercial associate | Higher education | Married | Municipal apartment | 0.046220 | -16344 | -1478 | -482.0 | -4192 | NaN | 1 | 1 | 0 | 1 | 0 | 0 | Sales staff | 2.0 | 1 | 1 | WEDNESDAY | 11 | 0 | 1 | 1 | 0 | 0 | 0 | Business Entity Type 3 | NaN | 0.618698 | 0.431192 | 0.1918 | 0.1735 | 0.9866 | 0.8164 | 0.0464 | 0.12 | 0.1034 | 0.3333 | 0.0417 | 0.1784 | 0.1563 | 0.1728 | 0.0000 | 0.0000 | 0.1954 | 0.1800 | 0.9866 | 0.8236 | 0.0468 | 0.1208 | 0.1034 | 0.3333 | 0.0417 | 0.1825 | 0.1708 | 0.1800 | 0.0000 | 0.0000 | 0.1936 | 0.1735 | 0.9866 | 0.8189 | 0.0467 | 0.12 | 0.1034 | 0.3333 | 0.0417 | 0.1815 | 0.1590 | 0.1759 | 0.0000 | 0.0000 | reg oper account | block of flats | 0.1613 | Stone, brick | No | 1.0 | 0.0 | 1.0 | 0.0 | -1255.0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 1.0 | 0.0 | 0.0 | 2.0 |
| 239329 | 377174 | 0 | Cash loans | F | N | Y | 0 | 157500.0 | 792000.0 | NaN | 792000.0 | Family | State servant | Secondary / secondary special | Married | House / apartment | 0.026392 | -17661 | -2885 | -8594.0 | -1221 | NaN | 1 | 1 | 0 | 1 | 1 | 0 | Managers | 2.0 | 2 | 2 | SATURDAY | 14 | 0 | 0 | 0 | 0 | 0 | 0 | Other | 0.775404 | 0.656305 | 0.651260 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0.0 | 0.0 | 0.0 | 0.0 | -322.0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 1.0 |
| 241835 | 379997 | 0 | Cash loans | F | N | N | 0 | 315000.0 | 1483231.5 | NaN | 1354500.0 | Unaccompanied | Working | Higher education | Married | House / apartment | 0.072508 | -15072 | -152 | -7576.0 | -4389 | NaN | 1 | 1 | 1 | 1 | 1 | 0 | Accountants | 2.0 | 1 | 1 | WEDNESDAY | 15 | 0 | 1 | 1 | 0 | 1 | 1 | Self-employed | 0.593394 | 0.267269 | NaN | 0.3691 | 0.1891 | 0.9806 | 0.7348 | 0.2590 | 0.52 | 0.2241 | 0.5417 | 0.5833 | 0.0000 | 0.3009 | 0.3756 | 0.0000 | 0.2002 | 0.2216 | 0.1435 | 0.9796 | 0.7321 | 0.1590 | 0.3222 | 0.1379 | 0.5417 | 0.5833 | 0.0000 | 0.1938 | 0.2268 | 0.0000 | 0.0720 | 0.3726 | 0.1891 | 0.9806 | 0.7383 | 0.2607 | 0.52 | 0.2241 | 0.5417 | 0.5833 | 0.0000 | 0.3061 | 0.3824 | 0.0000 | 0.2044 | reg oper spec account | block of flats | 0.1860 | Stone, brick | No | 0.0 | 0.0 | 0.0 | 0.0 | -504.0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | NaN | NaN | NaN | NaN | NaN | NaN |
pd.reset_option('max_columns')
Ci sono 12 entrate che hanno AMT_ANNUITY a NaN, sono numeri, e non sembra chiara una scelta da sostituire al NaN, soprattutto in vista della forma dell'istogramma, non riesco neanche a immaginarmi spiegazioni sensate del perché questo dato possa mancare se non errore di inserimento. Non riesco a trovare colonne del dataset che sembrano essere collegate alla mancanza di questo dato.
df_helper = df[df['AMT_GOODS_PRICE'].isna()].copy()
df_helper = df_helper[['AMT_GOODS_PRICE', 'NAME_TYPE_SUITE']].drop_duplicates()
df_helper
| AMT_GOODS_PRICE | NAME_TYPE_SUITE | |
|---|---|---|
| 724 | NaN | NaN |
df_helper = df[df['AMT_GOODS_PRICE'].isna()].copy()
df_helper = df_helper[['AMT_GOODS_PRICE', 'NAME_TYPE_SUITE']]
df_helper
| AMT_GOODS_PRICE | NAME_TYPE_SUITE | |
|---|---|---|
| 724 | NaN | NaN |
| 5937 | NaN | NaN |
| 6425 | NaN | NaN |
| 6703 | NaN | NaN |
| 7647 | NaN | NaN |
| ... | ... | ... |
| 304678 | NaN | NaN |
| 304784 | NaN | NaN |
| 305833 | NaN | NaN |
| 306126 | NaN | NaN |
| 306273 | NaN | NaN |
278 rows × 2 columns
df_helper = df[df['NAME_TYPE_SUITE'].isna()].copy()
df_helper = df_helper[['AMT_GOODS_PRICE', 'NAME_TYPE_SUITE']]
df_helper
| AMT_GOODS_PRICE | NAME_TYPE_SUITE | |
|---|---|---|
| 64 | 315000.0 | NaN |
| 724 | NaN | NaN |
| 1199 | 477000.0 | NaN |
| 1669 | 832500.0 | NaN |
| 1862 | 243000.0 | NaN |
| ... | ... | ... |
| 306273 | NaN | NaN |
| 306544 | 894825.0 | NaN |
| 306615 | 225000.0 | NaN |
| 306636 | 585000.0 | NaN |
| 306725 | 675000.0 | NaN |
1292 rows × 2 columns
In tutte le entrate di AMT_GOODS_PRICE che contengono un NaN (278 righe) abbiamo che non è segnato neanche la persona accompagnatrice, che però è un dato mancante in 1292 righe. Infatti il viceversa non vale. Non sembra chiaro il perché ci possa essere un collegamento tra le due cose, ma era già evidenziato dall'heatmap
%matplotlib inline
missingno.bar(second_df, color=(63/255,132/255,172/255))
<AxesSubplot: >
%matplotlib inline
second_df['FLAG_OWN_CAR'] = df['FLAG_OWN_CAR'].copy()
second_df['FLAG_OWN_CAR'].replace(to_replace='N',value=np.nan, inplace=True)
missingno.matrix(second_df, color=(63/255,132/255,172/255))
<AxesSubplot: >
df_helper = df[['OWN_CAR_AGE']].copy()
sns.displot(data=df_helper, x="OWN_CAR_AGE", kde=True)
<seaborn.axisgrid.FacetGrid at 0x7efcb041c460>
df_helper = df[['DAYS_BIRTH']].copy()
df_helper['YEARS_BIRTH_POSITIVE'] = df_helper.apply(lambda vec : np.abs(vec[:])/365)
sns.displot(data=df_helper, x="YEARS_BIRTH_POSITIVE", kde=True)
<seaborn.axisgrid.FacetGrid at 0x7efcb02dfd00>
print(data_descriptions[data_descriptions['Row'] == 'OWN_CAR_AGE']['Description'].to_list())
print(data_descriptions[data_descriptions['Row'] == 'OWN_CAR_AGE']['Special'].to_list())
print(data_descriptions[data_descriptions['Row'] == 'FLAG_OWN_CAR']['Description'].to_list())
print(data_descriptions[data_descriptions['Row'] == 'FLAG_OWN_CAR']['Special'].to_list())
["Age of client's car"] [nan] ['Flag if the client owns a car'] [nan]
helper_df = df[df['OWN_CAR_AGE'].isna()].copy()
print(len(helper_df))
helper_df = helper_df[['FLAG_OWN_CAR','OWN_CAR_AGE']]
helper_df.drop_duplicates(inplace=True)
helper_df
202929
| FLAG_OWN_CAR | OWN_CAR_AGE | |
|---|---|---|
| 0 | N | NaN |
| 30897 | Y | NaN |
helper_df = df[['FLAG_OWN_CAR','OWN_CAR_AGE']].copy()
helper_df = helper_df[(helper_df['FLAG_OWN_CAR'] == 'Y') & (helper_df['OWN_CAR_AGE'].isna())]
helper_df
| FLAG_OWN_CAR | OWN_CAR_AGE | |
|---|---|---|
| 30897 | Y | NaN |
| 181231 | Y | NaN |
| 217549 | Y | NaN |
| 229867 | Y | NaN |
| 236868 | Y | NaN |
helper_df = df[['FLAG_OWN_CAR','OWN_CAR_AGE']].copy()
helper_df = helper_df[(helper_df['FLAG_OWN_CAR'] == 'N') & (helper_df['OWN_CAR_AGE'].notna())]
helper_df
| FLAG_OWN_CAR | OWN_CAR_AGE |
|---|
Per quanto riguarda OWN_CAR_AGE, abbiamo 202929 NaN che corrispondono quasi interamente a casi in cui FLAG_OWN_CAR è posto a N, quindi persone che non hanno una automobile. Tuttavia in 5 casi risulta FLAG_OWN_CAR = Y e comunque OWN_CAR_AGE = None, questi sono casi in cui effettivamente il valore manca. Il caso contrario, ovvero FLAG_OWN_CAR pari a N e OWN_CAR_AGE non nullo, invece, non capita mai.
%matplotlib inline
missingno.bar(third_df, color=(63/255,132/255,172/255))
<AxesSubplot: >
%matplotlib inline
missingno.matrix(third_df, color=(63/255,132/255,172/255))
<AxesSubplot: >
print(data_descriptions[data_descriptions['Row'] == 'CNT_FAM_MEMBERS']['Description'].to_list())
print(data_descriptions[data_descriptions['Row'] == 'CNT_FAM_MEMBERS']['Special'].to_list())
['How many family members does client have'] [nan]
df[df['CNT_FAM_MEMBERS'].isna()]
| SK_ID_CURR | TARGET | NAME_CONTRACT_TYPE | CODE_GENDER | FLAG_OWN_CAR | FLAG_OWN_REALTY | CNT_CHILDREN | AMT_INCOME_TOTAL | AMT_CREDIT | AMT_ANNUITY | ... | FLAG_DOCUMENT_18 | FLAG_DOCUMENT_19 | FLAG_DOCUMENT_20 | FLAG_DOCUMENT_21 | AMT_REQ_CREDIT_BUREAU_HOUR | AMT_REQ_CREDIT_BUREAU_DAY | AMT_REQ_CREDIT_BUREAU_WEEK | AMT_REQ_CREDIT_BUREAU_MON | AMT_REQ_CREDIT_BUREAU_QRT | AMT_REQ_CREDIT_BUREAU_YEAR | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 41982 | 148605 | 0 | Revolving loans | M | N | Y | 0 | 450000.0 | 675000.0 | 33750.0 | ... | 0 | 0 | 0 | 0 | NaN | NaN | NaN | NaN | NaN | NaN |
| 187348 | 317181 | 0 | Revolving loans | F | N | Y | 0 | 202500.0 | 585000.0 | 29250.0 | ... | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
2 rows × 122 columns
df[df['CNT_FAM_MEMBERS'] == 0]
| SK_ID_CURR | TARGET | NAME_CONTRACT_TYPE | CODE_GENDER | FLAG_OWN_CAR | FLAG_OWN_REALTY | CNT_CHILDREN | AMT_INCOME_TOTAL | AMT_CREDIT | AMT_ANNUITY | ... | FLAG_DOCUMENT_18 | FLAG_DOCUMENT_19 | FLAG_DOCUMENT_20 | FLAG_DOCUMENT_21 | AMT_REQ_CREDIT_BUREAU_HOUR | AMT_REQ_CREDIT_BUREAU_DAY | AMT_REQ_CREDIT_BUREAU_WEEK | AMT_REQ_CREDIT_BUREAU_MON | AMT_REQ_CREDIT_BUREAU_QRT | AMT_REQ_CREDIT_BUREAU_YEAR |
|---|
0 rows × 122 columns
df[df['CNT_FAM_MEMBERS'] == 1]
| SK_ID_CURR | TARGET | NAME_CONTRACT_TYPE | CODE_GENDER | FLAG_OWN_CAR | FLAG_OWN_REALTY | CNT_CHILDREN | AMT_INCOME_TOTAL | AMT_CREDIT | AMT_ANNUITY | ... | FLAG_DOCUMENT_18 | FLAG_DOCUMENT_19 | FLAG_DOCUMENT_20 | FLAG_DOCUMENT_21 | AMT_REQ_CREDIT_BUREAU_HOUR | AMT_REQ_CREDIT_BUREAU_DAY | AMT_REQ_CREDIT_BUREAU_WEEK | AMT_REQ_CREDIT_BUREAU_MON | AMT_REQ_CREDIT_BUREAU_QRT | AMT_REQ_CREDIT_BUREAU_YEAR | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 100002 | 1 | Cash loans | M | N | Y | 0 | 202500.0 | 406597.5 | 24700.5 | ... | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
| 2 | 100004 | 0 | Revolving loans | M | Y | Y | 0 | 67500.0 | 135000.0 | 6750.0 | ... | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 4 | 100007 | 0 | Cash loans | M | N | Y | 0 | 121500.0 | 513000.0 | 21865.5 | ... | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 |
| 9 | 100012 | 0 | Revolving loans | M | N | Y | 0 | 135000.0 | 405000.0 | 20250.0 | ... | 0 | 0 | 0 | 0 | NaN | NaN | NaN | NaN | NaN | NaN |
| 15 | 100019 | 0 | Cash loans | M | Y | Y | 0 | 157500.0 | 299772.0 | 20160.0 | ... | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 307504 | 456248 | 0 | Cash loans | F | N | Y | 0 | 153000.0 | 331920.0 | 16096.5 | ... | 0 | 0 | 0 | 0 | NaN | NaN | NaN | NaN | NaN | NaN |
| 307505 | 456249 | 0 | Cash loans | F | N | Y | 0 | 112500.0 | 225000.0 | 22050.0 | ... | 0 | 0 | 0 | 0 | 0.0 | 0.0 | 0.0 | 2.0 | 0.0 | 0.0 |
| 307506 | 456251 | 0 | Cash loans | M | N | N | 0 | 157500.0 | 254700.0 | 27558.0 | ... | 0 | 0 | 0 | 0 | NaN | NaN | NaN | NaN | NaN | NaN |
| 307507 | 456252 | 0 | Cash loans | F | N | Y | 0 | 72000.0 | 269550.0 | 12001.5 | ... | 0 | 0 | 0 | 0 | NaN | NaN | NaN | NaN | NaN | NaN |
| 307508 | 456253 | 0 | Cash loans | F | N | Y | 0 | 153000.0 | 677664.0 | 29979.0 | ... | 0 | 0 | 0 | 0 | 1.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 |
67847 rows × 122 columns
Ci sono esattamente due casi in cui CNT_FAM_MEMBERS risulta pari a NaN, sembrano dei chiari errori di inserimento
%matplotlib inline
missingno.bar(fourth_df, color=(63/255,132/255,172/255))
<AxesSubplot: >
%matplotlib inline
missingno.matrix(fourth_df, color=(63/255,132/255,172/255))
<AxesSubplot: >
print(data_descriptions[data_descriptions['Row'] == 'EXT_SOURCE_1']['Description'].to_list())
print(data_descriptions[data_descriptions['Row'] == 'EXT_SOURCE_1']['Special'].to_list())
print(data_descriptions[data_descriptions['Row'] == 'EXT_SOURCE_2']['Description'].to_list())
print(data_descriptions[data_descriptions['Row'] == 'EXT_SOURCE_2']['Special'].to_list())
print(data_descriptions[data_descriptions['Row'] == 'EXT_SOURCE_3']['Description'].to_list())
print(data_descriptions[data_descriptions['Row'] == 'EXT_SOURCE_3']['Special'].to_list())
['Normalized score from external data source'] ['normalized'] ['Normalized score from external data source'] ['normalized'] ['Normalized score from external data source'] ['normalized']
sns.pairplot(df[['EXT_SOURCE_1','EXT_SOURCE_2','EXT_SOURCE_3']], markers='*')
<seaborn.axisgrid.PairGrid at 0x7efcabd53b20>
print(len(df[df['EXT_SOURCE_1'].isna()]))
print(len(df[df['EXT_SOURCE_2'].isna()]))
print(len(df[df['EXT_SOURCE_3'].isna()]))
print(len(df[df['EXT_SOURCE_3'] <= 0.0005272653]))
print(len(df[df['EXT_SOURCE_3'] <= 0.0005272652]))
173378 660 60965 886 0
df[df['EXT_SOURCE_3'] <= 0.0005272653][['EXT_SOURCE_3']]
| EXT_SOURCE_3 | |
|---|---|
| 374 | 0.000527 |
| 970 | 0.000527 |
| 991 | 0.000527 |
| 1152 | 0.000527 |
| 1283 | 0.000527 |
| ... | ... |
| 305434 | 0.000527 |
| 305726 | 0.000527 |
| 306408 | 0.000527 |
| 306430 | 0.000527 |
| 306557 | 0.000527 |
886 rows × 1 columns
Purtroppo non c'è nessuna spiegazione di questi dati, che contengono un sacco di entrate pari a NaN. Inoltre EXT_SOURCE_3 ha 886 sospette entrate molto vicine a 0
%matplotlib inline
missingno.bar(fifth_df, color=(63/255,132/255,172/255))
<AxesSubplot: >
%matplotlib inline
missingno.matrix(fifth_df, color=(63/255,132/255,172/255))
<AxesSubplot: >
%matplotlib inline
missingno.heatmap(fifth_df)
<AxesSubplot: >
aux = my_vars[my_vars['Category'] == 5]
my_vars_vec = aux['Row'].to_list()
sns.histplot(df[my_vars_vec[0]])
<AxesSubplot: xlabel='APARTMENTS_AVG', ylabel='Count'>
sns.histplot(df[my_vars_vec[1]])
<AxesSubplot: xlabel='BASEMENTAREA_AVG', ylabel='Count'>
sns.histplot(df[my_vars_vec[2]])
<AxesSubplot: xlabel='YEARS_BEGINEXPLUATATION_AVG', ylabel='Count'>
sns.histplot(df[my_vars_vec[3]])
<AxesSubplot: xlabel='YEARS_BUILD_AVG', ylabel='Count'>
sns.histplot(df[my_vars_vec[4]])
<AxesSubplot: xlabel='COMMONAREA_AVG', ylabel='Count'>
sns.histplot(df[my_vars_vec[5]])
<AxesSubplot: xlabel='ELEVATORS_AVG', ylabel='Count'>
sns.histplot(df[my_vars_vec[6]])
<AxesSubplot: xlabel='ENTRANCES_AVG', ylabel='Count'>
sns.histplot(df[my_vars_vec[7]])
<AxesSubplot: xlabel='FLOORSMAX_AVG', ylabel='Count'>
print(data_descriptions[data_descriptions['Row'] == 'APARTMENTS_AVG']['Description'].to_list())
print(data_descriptions[data_descriptions['Row'] == 'APARTMENTS_AVG']['Special'].to_list())
print(data_descriptions[data_descriptions['Row'] == 'BASEMENTAREA_AVG']['Description'].to_list())
print(data_descriptions[data_descriptions['Row'] == 'BASEMENTAREA_AVG']['Special'].to_list())
print(data_descriptions[data_descriptions['Row'] == 'YEARS_BEGINEXPLUATATION_AVG']['Description'].to_list())
print(data_descriptions[data_descriptions['Row'] == 'YEARS_BEGINEXPLUATATION_AVG']['Special'].to_list())
print(data_descriptions[data_descriptions['Row'] == 'YEARS_BUILD_AVG']['Description'].to_list())
print(data_descriptions[data_descriptions['Row'] == 'YEARS_BUILD_AVG']['Special'].to_list())
print(data_descriptions[data_descriptions['Row'] == 'COMMONAREA_AVG']['Description'].to_list())
print(data_descriptions[data_descriptions['Row'] == 'COMMONAREA_AVG']['Special'].to_list())
print(data_descriptions[data_descriptions['Row'] == 'ELEVATORS_AVG']['Description'].to_list())
print(data_descriptions[data_descriptions['Row'] == 'ELEVATORS_AVG']['Special'].to_list())
print(data_descriptions[data_descriptions['Row'] == 'ENTRANCES_AVG']['Description'].to_list())
print(data_descriptions[data_descriptions['Row'] == 'ENTRANCES_AVG']['Special'].to_list())
print(data_descriptions[data_descriptions['Row'] == 'FLOORSMAX_AVG']['Description'].to_list())
print(data_descriptions[data_descriptions['Row'] == 'FLOORSMAX_AVG']['Special'].to_list())
['Normalized information about building where the client lives, What is average (_AVG suffix), modus (_MODE suffix), median (_MEDI suffix) apartment size, common area, living area, age of building, number of elevators, number of entrances, state of the building, number of floor'] ['normalized'] ['Normalized information about building where the client lives, What is average (_AVG suffix), modus (_MODE suffix), median (_MEDI suffix) apartment size, common area, living area, age of building, number of elevators, number of entrances, state of the building, number of floor'] ['normalized'] ['Normalized information about building where the client lives, What is average (_AVG suffix), modus (_MODE suffix), median (_MEDI suffix) apartment size, common area, living area, age of building, number of elevators, number of entrances, state of the building, number of floor'] ['normalized'] ['Normalized information about building where the client lives, What is average (_AVG suffix), modus (_MODE suffix), median (_MEDI suffix) apartment size, common area, living area, age of building, number of elevators, number of entrances, state of the building, number of floor'] ['normalized'] ['Normalized information about building where the client lives, What is average (_AVG suffix), modus (_MODE suffix), median (_MEDI suffix) apartment size, common area, living area, age of building, number of elevators, number of entrances, state of the building, number of floor'] ['normalized'] ['Normalized information about building where the client lives, What is average (_AVG suffix), modus (_MODE suffix), median (_MEDI suffix) apartment size, common area, living area, age of building, number of elevators, number of entrances, state of the building, number of floor'] ['normalized'] ['Normalized information about building where the client lives, What is average (_AVG suffix), modus (_MODE suffix), median (_MEDI suffix) apartment size, common area, living area, age of building, number of elevators, number of entrances, state of the building, number of floor'] ['normalized'] ['Normalized information about building where the client lives, What is average (_AVG suffix), modus (_MODE suffix), median (_MEDI suffix) apartment size, common area, living area, age of building, number of elevators, number of entrances, state of the building, number of floor'] ['normalized']
sixth_df = fifth_df.copy()
sixth_df['FLAG_OWN_REALTY'] = df['FLAG_OWN_REALTY'].copy()
sixth_df['FLAG_OWN_REALTY'].replace(to_replace='N',value=np.nan, inplace=True)
%matplotlib inline
missingno.bar(sixth_df, color=(63/255,132/255,172/255))
<AxesSubplot: >
%matplotlib inline
missingno.matrix(sixth_df, color=(63/255,132/255,172/255))
<AxesSubplot: >
%matplotlib inline
missingno.heatmap(sixth_df)
<AxesSubplot: >
A quanto pare non c'è legame tra FLAG_OWN_REALTY e queste variabili relative al luogo di abitazione
sixth_df[sixth_df['FLAG_OWN_REALTY'].isna()]
| APARTMENTS_AVG | BASEMENTAREA_AVG | YEARS_BEGINEXPLUATATION_AVG | YEARS_BUILD_AVG | COMMONAREA_AVG | ELEVATORS_AVG | ENTRANCES_AVG | FLOORSMAX_AVG | FLAG_OWN_REALTY | |
|---|---|---|---|---|---|---|---|---|---|
| 1 | 0.0959 | 0.0529 | 0.9851 | 0.7960 | 0.0605 | 0.08 | 0.0345 | 0.2917 | NaN |
| 13 | 0.1474 | 0.0973 | 0.9806 | 0.7348 | 0.0582 | 0.16 | 0.1379 | 0.3333 | NaN |
| 16 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 22 | 0.0907 | 0.0795 | 0.9786 | 0.7076 | 0.0120 | 0.00 | 0.2069 | 0.1667 | NaN |
| 24 | 0.1433 | 0.1455 | 0.9861 | 0.8096 | 0.0212 | 0.00 | 0.3103 | 0.1667 | NaN |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 307494 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 307495 | 0.0742 | NaN | 0.9871 | NaN | 0.0136 | 0.08 | 0.0690 | 0.3333 | NaN |
| 307496 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 307506 | 0.2021 | 0.0887 | 0.9876 | 0.8300 | 0.0202 | 0.22 | 0.1034 | 0.6042 | NaN |
| 307510 | 0.0742 | 0.0526 | 0.9881 | NaN | 0.0176 | 0.08 | 0.0690 | 0.3750 | NaN |
94199 rows × 9 columns
Boh provo intanto a togliere tutte le righe in cui sono presenti tutti NaN relativi alle colonne relative al luogo di abitazione
df_vars_abitazione = data_descriptions[(data_descriptions['Unnamed: 0'] >= 47) & (data_descriptions['Unnamed: 0'] <= 93)]
vars_abitazione = df_vars_abitazione['Row'].to_list()
df_abitazione = df[vars_abitazione]
print(len(df_abitazione[df_abitazione.isna().all(axis=1)]))
df_abitazione = df_abitazione[df_abitazione.notna().any(axis=1)]
aux = my_vars[my_vars['Category'] == 5]
my_vars_vec = aux['Row'].to_list()
seventh_df = df_abitazione[my_vars_vec].copy()
seventh_df
145755
| APARTMENTS_AVG | BASEMENTAREA_AVG | YEARS_BEGINEXPLUATATION_AVG | YEARS_BUILD_AVG | COMMONAREA_AVG | ELEVATORS_AVG | ENTRANCES_AVG | FLOORSMAX_AVG | |
|---|---|---|---|---|---|---|---|---|
| 0 | 0.0247 | 0.0369 | 0.9722 | 0.6192 | 0.0143 | 0.00 | 0.0690 | 0.0833 |
| 1 | 0.0959 | 0.0529 | 0.9851 | 0.7960 | 0.0605 | 0.08 | 0.0345 | 0.2917 |
| 12 | 0.0825 | NaN | 0.9811 | NaN | NaN | 0.00 | 0.2069 | 0.1667 |
| 13 | 0.1474 | 0.0973 | 0.9806 | 0.7348 | 0.0582 | 0.16 | 0.1379 | 0.3333 |
| 14 | 0.3495 | 0.1335 | 0.9985 | 0.9796 | 0.1143 | 0.40 | 0.1724 | 0.6667 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 307506 | 0.2021 | 0.0887 | 0.9876 | 0.8300 | 0.0202 | 0.22 | 0.1034 | 0.6042 |
| 307507 | 0.0247 | 0.0435 | 0.9727 | 0.6260 | 0.0022 | 0.00 | 0.1034 | 0.0833 |
| 307508 | 0.1031 | 0.0862 | 0.9816 | 0.7484 | 0.0123 | 0.00 | 0.2069 | 0.1667 |
| 307509 | 0.0124 | NaN | 0.9771 | NaN | NaN | NaN | 0.0690 | 0.0417 |
| 307510 | 0.0742 | 0.0526 | 0.9881 | NaN | 0.0176 | 0.08 | 0.0690 | 0.3750 |
161756 rows × 8 columns
Ci sono 145755 entrate in cui tutte le variabili relative all'abitazione di residenza (non solo quelle datemi dalla suddivisione) sono NaN
%matplotlib inline
missingno.bar(seventh_df, color=(63/255,132/255,172/255))
<AxesSubplot: >
%matplotlib inline
missingno.matrix(seventh_df, color=(63/255,132/255,172/255))
<AxesSubplot: >
%matplotlib inline
missingno.heatmap(seventh_df)
<AxesSubplot: >
Francamente ha senso su questo fare i plot anche considerando tutte le altre variabili relative all'abitazione
df_vars_abitazione = data_descriptions[(data_descriptions['Unnamed: 0'] >= 47) & (data_descriptions['Unnamed: 0'] <= 93)]
vars_abitazione = df_vars_abitazione['Row'].to_list()
df_abitazione = df[vars_abitazione]
print(len(df_abitazione[df_abitazione.isna().all(axis=1)]))
df_abitazione = df_abitazione[df_abitazione.notna().any(axis=1)]
145755
%matplotlib inline
missingno.bar(df_abitazione, color=(63/255,132/255,172/255))
<AxesSubplot: >
%matplotlib inline
missingno.matrix(df_abitazione, color=(63/255,132/255,172/255))
<AxesSubplot: >
%matplotlib inline
missingno.heatmap(df_abitazione)
<AxesSubplot: >
Vista la forte correlazione ha senso tenenere in considerazione unicamente le variabili AVG
df_vars_abitazione = data_descriptions[(data_descriptions['Unnamed: 0'] >= 47) & (data_descriptions['Unnamed: 0'] <= 60)]
vars_abitazione = df_vars_abitazione['Row'].to_list()
df_abitazione = df[vars_abitazione]
print(len(df_abitazione[df_abitazione.isna().all(axis=1)]))
df_abitazione = df_abitazione[df_abitazione.notna().any(axis=1)]
148810
%matplotlib inline
missingno.bar(df_abitazione, color=(63/255,132/255,172/255))
<AxesSubplot: >
%matplotlib inline
missingno.matrix(df_abitazione, color=(63/255,132/255,172/255))
<AxesSubplot: >
%matplotlib inline
missingno.heatmap(df_abitazione)
<AxesSubplot: >